Load Packages

In [1]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, Lars, Ridge, ElasticNet, LassoLars, LassoLarsCV, LinearRegression
import re
from umap import UMAP
import requests
import pandas as pd
from bs4 import BeautifulSoup
import seaborn as sns
import matplotlib.pyplot as plt
import gower
import pickle
from collections import Counter
import plotly.express as px
from xgboost import XGBRFRegressor
import shap
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

# import the real estate price analytics library
from lib.real_estate_analytics_library import *
In [2]:
# optional - suppress warnings so the notebook output stays readable
# NOTE(review): this also hides deprecation warnings from sklearn/pandas
import warnings
warnings.filterwarnings('ignore')

Scrape Property Price Data

In [3]:
# the root page link (Comparis rental search for Zürich; note the trailing
# '&page=' query parameter) is used to generate the links for all pages
root = 'https://en.comparis.ch/immobilien/result/list?requestobject=%7B%22DealType%22%3A%2210%22%2C%22LocationSearchString%22%3A%22Zurich%22%2C%22RootPropertyTypes%22%3A%5B%220%22%5D%2C%22PriceTo%22%3A%22-10%22%2C%22RoomsFrom%22%3A%22-10%22%2C%22Sort%22%3A%2211%22%2C%22AdAgeMax%22%3A-1%2C%22ComparisPointsMin%22%3A-1%2C%22SiteId%22%3A-1%7D&sort=11&page='
In [4]:
# Open the results page for properties in Zürich using the requests package.
# The first results page is simply the pagination root without its trailing
# '&page=' parameter, so reuse `root` instead of duplicating the long URL.
# A timeout prevents the notebook from hanging indefinitely on a dead server.
links_page = requests.get(root[:-len('&page=')], timeout=30)
In [5]:
# parse the results page with the stdlib-backed html.parser backend
soup = BeautifulSoup(links_page.content, features='html.parser')
In [6]:
# collect the hrefs of the pagination links at the bottom of the results page
links = [anchor['href'] for anchor in soup.find_all("a", {"class": "css-1yj1f35 excbu0j4"})]
In [7]:
# get the number of pages available for the location in question;
# the second-to-last pagination link carries the final page number
last_page_link = links[-2]
last_page_number = last_page_link[last_page_link.find('page=') + len('page='):]
num_pages = int(last_page_number) + 1
In [8]:
# generate the list of result-page URLs (pages 0 .. num_pages - 1)
property_links = [root + str(page_number) for page_number in range(num_pages)]
In [9]:
# define the root that we will combine with the property ID, giving us the
# page for each property
# NOTE(review): this intentionally rebinds `root` (previously the result-list
# root); the pagination cells above must have run before this cell
root = 'https://en.comparis.ch/immobilien/marktplatz/details/show/'
In [10]:
# define the lists for storing the ad IDs and the detail page for each property
pages = []
property_id = []

for property_link in property_links:
    page = requests.get(property_link)

    soup = BeautifulSoup(page.content, 'html.parser')

    # capture the numeric ad ID directly with a regex group instead of
    # slicing '"AdId":<id>,' substrings by hand
    id_list = re.findall(r'"AdId":([-+]?[0-9]+),', str(soup))

    # combine the root with the property ID, giving us the page for each property
    property_id.extend(id_list)
    pages.extend([root + i for i in id_list])
In [11]:
# get the attributes for each property from the Comparis website
# NOTE(review): assumes every detail page contains the address <h3> and the
# attributes <dl>; a page missing either would raise here — confirm before reuse
properties = []

for page_url in pages:
    detail_page = requests.get(page_url)
    detail_soup = BeautifulSoup(detail_page.content, 'html.parser')
    address = list(detail_soup.find("h3", {"class": "text-green"}))
    attributes = list(detail_soup.find("dl", {"class": "row xsmall-up-2 medium-up-3 large-up-4 attributes-grid"}).stripped_strings)
    properties.append([address, attributes])
In [12]:
# check the length of the property attributes list (one entry per scraped ad)
len(properties)
Out[12]:
1000
In [13]:
# define one accumulator list per attribute gathered from the scraped data;
# the parsing loop below appends exactly one entry (value or None) per record
property_type, gross_rent, net_rent, living_space, rooms = ([] for _ in range(5))
floor, available_date, public_transport, motorway, shop = ([] for _ in range(5))
In [14]:
# flatten the property address list: each record is [address_parts, attributes]
property_address = [address_parts[0] for address_parts, _ in properties]
In [15]:
# cycle through the scraped property data and separate it into attribute-based
# lists that will be used to create a pandas DataFrame


def extract_attribute(attributes, label, transform=None):
    """Return the value following `label` in the flat `attributes` list.

    The value is optionally passed through `transform`; None is returned when
    the label is missing or the value cannot be parsed (so every property
    record contributes exactly one entry per attribute list).
    """
    try:
        value = attributes[attributes.index(label) + 1]
        return transform(value) if transform is not None else value
    except Exception:
        # missing label or unparsable value -> record the gap explicitly
        return None


def parse_chf(value):
    """Parse a 'CHF 1,234' style rent string into a float."""
    return float(value[4:].replace(',', ''))


def parse_distance(value):
    """Parse a distance string with a 2-character trailing unit (e.g. '500 m')."""
    return float(value[:-2])


for record in properties:
    attributes = record[1]
    property_type.append(extract_attribute(attributes, 'Property type'))
    gross_rent.append(extract_attribute(attributes, 'Rent per month', parse_chf))
    net_rent.append(extract_attribute(attributes, 'Rent per month (without charges)', parse_chf))
    # living space looks like '67 m2' — strip the 3-character unit suffix
    living_space.append(extract_attribute(attributes, 'Living space', lambda v: float(v[:-3])))
    rooms.append(extract_attribute(attributes, 'Rooms', get_num_rooms))
    floor.append(extract_attribute(attributes, 'Floor'))
    available_date.append(extract_attribute(attributes, 'Available'))
    public_transport.append(extract_attribute(attributes, 'Public transport stop', parse_distance))
    motorway.append(extract_attribute(attributes, 'Motorway', parse_distance))
    shop.append(extract_attribute(attributes, 'Shops', parse_distance))
In [16]:
# convert the scraped ad IDs from strings to integers
property_id = list(map(int, property_id))
In [17]:
# assemble the per-attribute lists into a single records DataFrame
record_columns = ['property_id', 'property_address', 'property_type', 'gross_rent', 'net_rent', 'living_space', 'rooms', 'floor', 'available_date', 'public_transport', 'motorway', 'shop']
record_rows = zip(property_id, property_address, property_type, gross_rent, net_rent, living_space, rooms, floor, available_date, public_transport, motorway, shop)
property_records = pd.DataFrame(list(record_rows), columns=record_columns)
In [18]:
# attempt to load the previous records, and combine them with the new ones;
# only a missing file (first run) is silently ignored — any other failure
# should surface instead of being swallowed by a bare except
try:
    # load the previous records
    previous_property_records = pd.read_csv('data/property_records_rent.csv')

    # concatenate the new records and the previous records; drop the CSV's
    # round-tripped index column ('Unnamed: 0') if it is present
    property_records = pd.concat([property_records, previous_property_records], axis=0).drop(columns=['Unnamed: 0'], errors='ignore')

    # keep only the most recent version of each property
    property_records = property_records.drop_duplicates(subset=['property_id'], keep='last').reset_index(drop=True)
except FileNotFoundError:
    # first run: there are no previous records to merge
    pass
In [19]:
# show the combined records (rich DataFrame display)
property_records
Out[19]:
property_id property_address property_type gross_rent net_rent living_space rooms floor available_date public_transport motorway shop
0 24639578 Carl Spittelerstr. 63, 8053 Zürich Apartment 1570.0 1300.0 67.0 3.0 None None NaN NaN 500.0
1 24639315 8053 Zürich Apartment 1570.0 1300.0 67.0 3.0 None None NaN NaN NaN
2 24639289 Gladbachstrasse 81, 8044 Zürich Other 185.0 180.0 NaN NaN None Immediately NaN NaN NaN
3 24639301 8002 Zürich Loft 4500.0 4200.0 140.0 1.0 5. floor None NaN NaN NaN
4 24639300 8057 Zürich Apartment 1772.0 1450.0 56.0 2.0 1. floor None NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
1067 24522013 Ottikerstrasse 11, 8006 Zürich Apartment 2495.0 2270.0 52.0 1.5 2. floor 01/12/2020 20.0 NaN 20.0
1068 24522004 Quellenstrasse 31, 8005 Zürich Commercial property 1480.0 NaN 40.0 NaN 2. floor NaN NaN NaN NaN
1069 24521998 Albisstrasse 114, 8038 Zürich Furnished apartment 1800.0 NaN 41.0 1.5 1. floor Immediately 50.0 700.0 200.0
1070 24521997 Wehrenbachhalde 35, 8053 Zürich Commercial property 350.0 300.0 41.0 1.0 Basement NaN NaN NaN NaN
1071 24520514 Cresta Park (Etzelstr. 49), 8038 Zürich Underground garage NaN NaN NaN NaN NaN Immediately NaN NaN NaN

1072 rows × 12 columns

In [20]:
# save the scraped property records
# NOTE(review): saved with the default index, which round-trips back as the
# 'Unnamed: 0' column that the merge cell above has to drop
property_records.to_csv('data/property_records_rent.csv')

Process Data

In this section, we process the scraped web data. This involves encoding all features as the appropriate data type and performing imputation (i.e. encoding missing data points as the mean, median or mode of the existing data).

In [21]:
# reload the raw scraped records from disk so this section can run standalone
property_records = pd.read_csv('data/property_records_rent.csv')
In [22]:
# display the ratio of missing values for each of the below features;
# .isna().mean() is exactly the fraction of rows where the value is missing
missing_value_columns = ['gross_rent', 'living_space', 'rooms', 'property_address',
                         'floor', 'property_type', 'shop', 'public_transport', 'motorway']

for column in missing_value_columns:
    print(column + ':', property_records[column].isna().mean())
gross_rent: 0.07742537313432836
living_space: 0.47574626865671643
rooms: 0.2658582089552239
property_address: 0.0
floor: 0.37779850746268656
property_type: 0.0
shop: 0.7761194029850746
public_transport: 0.7593283582089553
motorway: 0.9113805970149254
In [23]:
# process the data for use in a price prediction model, pricing analytics
# (process_records comes from lib.real_estate_analytics_library; presumably it
# types, imputes and one-hot encodes the features — TODO confirm)
property_records = process_records(property_records)
In [24]:
# save the processed (typed / imputed / encoded) property records
property_records.to_csv('data/processed_property_records_rent.csv')
In [25]:
# save the possible values for each categorical feature, one pickle per feature,
# so the prediction section can validate user input later
for path, column in [
    ('data/possible_postcodes.pickle', 'property_postcode'),
    ('data/possible_floors.pickle', 'floor'),
    ('data/possible_types.pickle', 'property_type'),
]:
    with open(path, 'wb') as handle:
        pickle.dump(list(property_records[column].unique()), handle)

Model Selection and Training

In this section we will select, train and save two models - one tree-based model, and one linear regression-based model. The tree-based model will be selected because it has a lower mean absolute error, while the linear regression-based model will be used to extrapolate the price of real estate that falls outside of the range of the training data (i.e. very high-value real estate), since tree-based models cannot predict values that are higher than the highest target value in the dataset on which they are trained.

Note: the linear model assumes that there is a linear relationship between price and other features such as living space and number of rooms for larger properties outside of the dataset.

The methodology used in this Jupyter notebook assumes stability in the price data for the records that were scraped - that is, we assume that the prices did not significantly change over the time period covered by the property listings.

In [26]:
# reload the processed records so the modelling section can run standalone
property_records = pd.read_csv('data/processed_property_records_rent.csv')
In [27]:
# split the processed records into model inputs (x) and the target (y);
# identifiers, targets and raw categorical columns are excluded from x
excluded_columns = ['property_id', 'gross_rent', 'net_rent', 'Unnamed: 0', 'property_address', 'available_date', 'property_type', 'floor', 'property_postcode']
x = property_records[[column for column in property_records.columns if column not in excluded_columns]]
y = property_records['gross_rent']
In [28]:
# interactive scatter: gross rent against room count, coloured by property type
fig = px.scatter(
    property_records,
    x="rooms",
    y="gross_rent",
    color="property_type",
    hover_data=['property_postcode'],
    title="Gross Rent vs Number of Rooms",
)
fig.show()
In [29]:
# per-room-count distribution of gross rent (individual points hidden)
fig = px.box(
    property_records,
    x="rooms",
    y="gross_rent",
    points=False,
    title="Gross Rent vs Number of Rooms",
)
fig.show()
In [30]:
# interactive scatter: gross rent against living space, coloured by property type
fig = px.scatter(
    property_records,
    x="living_space",
    y="gross_rent",
    color="property_type",
    hover_data=['property_postcode'],
    title="Gross Rent vs Living Space",
)
fig.show()
In [31]:
# per-postcode distribution of gross rent
# (title previously mislabelled as 'Gross Rent vs Living Space')
fig = px.box(property_records, x="property_postcode", y="gross_rent", title="Gross Rent vs Postcode", points=False)
fig.update_xaxes(type='category')
fig.show()
In [32]:
# per-floor distribution of gross rent
# (title previously mislabelled as 'Gross Rent vs Living Space')
fig = px.box(property_records, x="floor", y="gross_rent", title="Gross Rent vs Floor", points=False)
fig.update_xaxes(type='category')
fig.show()
In [33]:
# per-property-type distribution of gross rent
# (title previously mislabelled as 'Gross Rent vs Living Space')
fig = px.box(property_records, x="property_type", y="gross_rent", title="Gross Rent vs Property Type", points=False)
fig.update_xaxes(type='category')
fig.show()
In [34]:
# standardize the continuous features and append them as new scaled_* columns;
# both x and the scaled frame carry a default RangeIndex, so concat aligns rows
float_columns = ['living_space', 'rooms', 'public_transport', 'motorway', 'shop']
scaler = StandardScaler().fit(x[float_columns])
scaled_features = pd.DataFrame(
    scaler.transform(x[float_columns]),
    columns=['scaled_' + column for column in float_columns],
)
x = pd.concat([x, scaled_features], axis=1)
In [35]:
# save the fitted scaler so the exact same scaling can be applied to
# prediction-time inputs later in this notebook
with open('data/scaler.pickle', 'wb') as handle:
    pickle.dump(scaler, handle)
In [36]:
# drop the raw continuous columns now that scaled_* versions exist
unscaled_columns = ['living_space', 'rooms', 'public_transport', 'motorway', 'shop']
x = x.drop(columns=unscaled_columns)
In [37]:
# correlation of each model feature with the dependent variable (gross_rent):
# keep only the gross_rent row of the full correlation matrix, minus itself
correlation_columns = [column for column in property_records.columns if column not in ['net_rent', 'Unnamed: 0', 'property_address', 'available_date', 'property_type', 'floor', 'property_postcode']]
correlation_matrix = (
    property_records[correlation_columns]
    .corr()
    .loc[['gross_rent']]
    .drop(['gross_rent'], axis=1)
)
In [38]:
# visualize the single-row correlation matrix as a heatmap
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    correlation_matrix,
    square=True,
    vmin=-1,
    vmax=1,
    ax=ax,
    linewidths=1,
    xticklabels=correlation_matrix.columns,
    cmap="Blues",
)
plt.yticks(rotation=0)
plt.show()
In [39]:
# variance inflation factors for the features (library helper); `inf` values
# indicate perfect multicollinearity from the one-hot encoded columns
get_vifs(x)
Out[39]:
[('8000', inf), ('8001', inf), ('8002', inf), ('8003', inf), ('8004', inf), ('8005', inf), ('8006', inf), ('8008', inf), ('8032', inf), ('8037', inf), ('8038', inf), ('8041', inf), ('8044', inf), ('8045', inf), ('8046', inf), ('8047', inf), ('8048', inf), ('8049', inf), ('8050', inf), ('8051', inf), ('8052', inf), ('8053', inf), ('8055', inf), ('8057', inf), ('8061', inf), ('8064', inf), ('1. floor', inf), ('10. floor', inf), ('11. floor', inf), ('12. floor', inf), ('13. floor', inf), ('14. floor', inf), ('16. floor', inf), ('17. floor', inf), ('18. floor', inf), ('19. floor', inf), ('2. floor', inf), ('3. floor', inf), ('4. floor', inf), ('5. floor', inf), ('6. floor', inf), ('7. floor', inf), ('8. floor', inf), ('9. floor', inf), ('Basement', inf), ('Ground floor', inf), ('Apartment', inf), ('Apartment with terrace', inf), ('Attic apartment', inf), ('Commercial property', inf), ('Furnished apartment', inf), ('Hobby room', inf), ('Loft', inf), ('Maisonette', inf), ('Other', inf), ('Parking space', inf), ('Parking space, garage', inf), ('Penthouse', inf), ('Shared apartment', inf), ('Single garage', inf), ('Single room', inf), ('Single-family house', inf), ('Studio', inf), ('Underground garage', inf), ('scaled_living_space', 1.5751389833355953), ('scaled_rooms', 1.4517295607650706), ('scaled_public_transport', 1.2245792246583553), ('scaled_shop', 1.1970272388668306), ('scaled_motorway', 1.095836791726966)]

The above VIFs indicate, as expected, serious multicollinearity in the data. This is because of the one-hot encoding of the categorical data. In order to fix this problem, we can eliminate a column from each of the categorical feature sets. We will select the columns below, based on their frequency in the data. This should not result in any significant loss in the performance of the model, as the removed values will still be indicated in the data (because all of the remaining columns/features will be 0 if the removed value is present). For example, if we remove the 'Apartment' encoding, then any record for an apartment will have all other property_type encodings set to 0 (e.g. features such as 'Single garage' will all be equal to 0).

In [40]:
# postcode frequencies, most common first (most_common() sorts by count desc)
Counter(property_records['property_postcode']).most_common()
Out[40]:
[(8050, 99), (8004, 89), (8048, 68), (8008, 65), (8032, 54), (8001, 53), (8005, 53), (8002, 46), (8003, 46), (8037, 42), (8057, 41), (8006, 40), (8047, 38), (8052, 38), (8049, 35), (8046, 29), (8051, 29), (8038, 27), (8045, 24), (8044, 20), (8055, 19), (8053, 17), (8041, 11), (8064, 4), (8061, 1), (8000, 1)]
In [41]:
# floor frequencies, most common first (most_common() sorts by count desc)
Counter(property_records['floor']).most_common()
Out[41]:
[('1. floor', 510), ('2. floor', 132), ('Ground floor', 101), ('3. floor', 76), ('4. floor', 56), ('Basement', 51), ('5. floor', 29), ('11. floor', 6), ('10. floor', 5), ('12. floor', 4), ('7. floor', 4), ('6. floor', 3), ('19. floor', 3), ('8. floor', 3), ('9. floor', 1), ('17. floor', 1), ('16. floor', 1), ('13. floor', 1), ('18. floor', 1), ('14. floor', 1)]
In [42]:
# property-type frequencies, most common first (most_common() sorts by count desc)
Counter(property_records['property_type']).most_common()
Out[42]:
[('Apartment', 453), ('Other', 137), ('Commercial property', 125), ('Underground garage', 70), ('Furnished apartment', 52), ('Parking space', 30), ('Single room', 29), ('Shared apartment', 18), ('Maisonette', 13), ('Penthouse', 12), ('Single garage', 11), ('Attic apartment', 10), ('Studio', 10), ('Parking space, garage', 7), ('Single-family house', 4), ('Loft', 3), ('Hobby room', 3), ('Apartment with terrace', 2)]
In [43]:
# define the columns that are to be eliminated from the input features to the
# Linear Regression model, in order to eliminate multicollinearity; the most
# frequent level of each one-hot encoded categorical is dropped
eliminated_columns = ['8001', '1. floor', 'Apartment']
In [44]:
# The below VIFs for the reduced data indicate no multicollinearity.
get_vifs(x.drop(columns=eliminated_columns))
Out[44]:
[('Underground garage', 1.7697408797644398), ('Basement', 1.6781974137457543), ('scaled_living_space', 1.5749839988429584), ('scaled_rooms', 1.450115466118659), ('Other', 1.4420025594700805), ('Commercial property', 1.3846451430834887), ('8064', 1.3517593639094174), ('9. floor', 1.3377551785020325), ('8050', 1.3296225196003688), ('Ground floor', 1.2936741231195756), ('2. floor', 1.2905231524954441), ('8004', 1.2652012453867045), ('8005', 1.2646558766495186), ('Apartment with terrace', 1.2632332950637606), ('3. floor', 1.2557294490536928), ('6. floor', 1.2478399644372578), ('8048', 1.2332178637039675), ('scaled_public_transport', 1.2203146494176533), ('8008', 1.2166420890195488), ('Furnished apartment', 1.209423756204152), ('4. floor', 1.1945861841143233), ('scaled_shop', 1.1931150502457952), ('8047', 1.1824389537753943), ('8003', 1.1759517621474036), ('8002', 1.1672864330785344), ('8052', 1.158872435441161), ('8032', 1.157322828048574), ('5. floor', 1.15452853749053), ('Parking space', 1.1508404541367285), ('8041', 1.1426903284682317), ('8006', 1.1411815068219526), ('Single room', 1.1385905216650545), ('Shared apartment', 1.1351630434027735), ('Attic apartment', 1.1187497219819311), ('Maisonette', 1.1132857342646387), ('13. floor', 1.108091689681934), ('8053', 1.1063061904990954), ('8051', 1.1046737386101535), ('8037', 1.1030156038516918), ('8057', 1.0993528064159914), ('8049', 1.09351721633413), ('Hobby room', 1.0917545524841683), ('8055', 1.0888700312612556), ('scaled_motorway', 1.0882095354583967), ('8038', 1.0859439674859725), ('8045', 1.0855045427389918), ('8046', 1.083312016029401), ('Studio', 1.0771847998864235), ('Penthouse', 1.0745428761742515), ('19. floor', 1.0666201739970094), ('8. floor', 1.0654401066683852), ('11. floor', 1.0646933218586028), ('8044', 1.0639606119336849), ('10. floor', 1.0619683227019712), ('Single garage', 1.0605878390753392), ('Parking space, garage', 1.051876225987026), ('Loft', 1.0506473551786621), ('18. 
floor', 1.0504257225142217), ('17. floor', 1.0435498480372214), ('7. floor', 1.042035133745346), ('Single-family house', 1.0417050846739366), ('8000', 1.0383761190454186), ('12. floor', 1.0368530411242596), ('8061', 1.0227430625221305), ('14. floor', 1.0144999612735444), ('16. floor', 1.0132826664311254)]
In [45]:
# save the list of eliminated columns so the prediction section can drop the
# same columns before calling the linear model
with open('data/eliminated_columns.pickle', 'wb') as handle:
    pickle.dump(eliminated_columns, handle)
In [46]:
# remove the outliers detected by Tukey's test (library helper) - this reduced
# dataset (xe, ye) will be used in the training of the linear models
xe, ye = remove_outliers_tukeys_test(x.drop(columns=eliminated_columns), y)
In [47]:
# use the Gower distance to scale the data for input into UMAP
# dimensionality-reduction, which takes into account the float inputs and
# their interaction with the one hot-encoded data; result is a 2-D embedding
umap_results = UMAP(n_neighbors=20).fit_transform(gower.gower_matrix(pd.concat([y, x], axis=1)))
In [48]:
# flag outlier rows with an isolation forest (library helper)
outlier_indices = get_outliers_isolation_forest(x, y, n_estimators=100, contamination=0.06)
# use a set for O(1) membership tests instead of scanning the list per row
outlier_index_set = set(outlier_indices)
normal_indices = [i for i in range(x.shape[0]) if i not in outlier_index_set]
In [49]:
# label the outlier points with their 2-D UMAP coordinates for plotting
outlier_points = umap_results[outlier_indices]
outliers = pd.DataFrame(
    zip(
        [point[0] for point in outlier_points],
        [point[1] for point in outlier_points],
        ['Outlier'] * len(outlier_indices),
    ),
    columns=['Dimension 1', 'Dimension 2', 'Status'],
)
In [50]:
# label the normal points with their 2-D UMAP coordinates for plotting.
# bug fix: the 'Normal' label list previously used len(outlier_indices), which
# truncated the zip and silently dropped most normal points from the plot
normal = pd.DataFrame(zip([v[0] for v in umap_results[normal_indices]], [v[1] for v in umap_results[normal_indices]], ['Normal'] * len(normal_indices)), columns=['Dimension 1', 'Dimension 2', 'Status'])
In [51]:
# combine normal and outlier points into one DataFrame with a fresh index
umap_data = pd.concat([normal, outliers], ignore_index=True)
In [52]:
# plot the UMAP results, showing the outliers vs normal data points, based on
# the isolation forest model; hover shows the row index within umap_data
fig = px.scatter(umap_data, x="Dimension 1", y="Dimension 2", color="Status", title="UMAP Result", hover_data=[umap_data.index.values])
fig.show()
In [53]:
# remove the outliers detected by the isolation forest (library helper) - this
# reduced dataset (xt, yt) will be used in the training of the tree-based models
xt, yt = remove_outliers_isolation_forest(x, y, n_estimators=100, contamination=0.06)
In [54]:
# candidate linear models for the model search, as [name, estimator] pairs
model_types = [
    ['Lasso', Lasso()],
    ['Ridge', Ridge()],
    ['ElasticNet', ElasticNet()],
    ['LassoLars', LassoLars()],
    ['LassoLarsCV', LassoLarsCV()],
    ['Lars', Lars()],
    ['LinearRegression', LinearRegression()],
]
In [55]:
# evaluate each linear candidate on the Tukey-reduced data (library helper;
# presumably the trailing 5 is the cross-validation fold count — TODO confirm)
model_results = train_model(xe, ye, model_types, 5)
In [56]:
# keep the five candidates with the lowest mean absolute error (element 4)
top_models = sorted(model_results, key=lambda result: result[4])[:5]
In [57]:
# show the top models together with their metric values
top_models
Out[57]:
[['Lasso', Lasso(alpha=0.5410184733743615, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False), 728.0918844066472, 526.3254240453502, 378.1740337464709], ['LassoLars', LassoLars(alpha=0.892577415416698, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True, fit_path=True, max_iter=500, normalize=True, positive=False, precompute='auto', verbose=False), 744.4792040548442, 530.7383707280804, 385.10351862248314], ['Ridge', Ridge(alpha=0.05165340321662827, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=None, solver='auto', tol=0.001), 729.054673071358, 528.808059262322, 388.9026628312811], ['ElasticNet', ElasticNet(alpha=0.002379743983777605, copy_X=True, fit_intercept=True, l1_ratio=0.5, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False), 725.5848542908823, 523.4892240037227, 391.90970491261976], ['LassoLarsCV', LassoLarsCV(copy_X=True, cv=None, eps=2.220446049250313e-16, fit_intercept=True, max_iter=500, max_n_alphas=1000, n_jobs=None, normalize=True, positive=False, precompute='auto', verbose=False), 737.3317719514218, 532.0807996154755, 401.7272632406037]]
In [58]:
# train the best-performing (lowest-MAE) model on the full reduced dataset.
# bug fix: this previously fit model_results[0][1] (arbitrary search order)
# while the saved MAE came from top_models[0]; the fitted model must be the
# same top_models[0] entry, matching the tree-model cell further below
linear_pricing_model = top_models[0][1].fit(xe, ye)
linear_pricing_model_mae = top_models[0][4]
In [59]:
# display the selected linear model and its hyperparameters
linear_pricing_model
Out[59]:
Lasso(alpha=0.5410184733743615, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
In [60]:
# save the selected model
with open('models/linear_pricing_model.pickle', 'wb') as handle:
    pickle.dump(linear_pricing_model, handle)

# save the model's MAE so prediction cells can report an uncertainty band
with open('models/linear_pricing_model_mae.pickle', 'wb') as handle:
    pickle.dump(linear_pricing_model_mae, handle)
In [61]:
# rank features by the magnitude of their regression coefficients (ascending)
regression_interpretation = pd.DataFrame(
    sorted(zip(xe.columns, linear_pricing_model.coef_), key=lambda pair: abs(pair[1])),
    columns=['Feature', 'Weight'],
)
In [62]:
# plot the regression coefficient-based feature importances
fig = px.scatter(regression_interpretation, x="Weight", y="Feature")
fig.update_yaxes(type='category')
fig.show()
In [63]:
# candidate tree-based models for the model search, as [name, estimator] pairs
model_types = [
    ['XGBRFRegressor', XGBRFRegressor()],
    ['AdaBoostRegressor', AdaBoostRegressor()],
    ['RandomForestRegressor', RandomForestRegressor()],
    ['ExtraTreesRegressor', ExtraTreesRegressor()],
    ['DecisionTreeRegressor', DecisionTreeRegressor()],
    ['GradientBoostingRegressor', GradientBoostingRegressor()],
]
In [64]:
# evaluate each tree-based candidate on the isolation-forest-reduced data
# (library helper; presumably the trailing 3 is the CV fold count — TODO confirm)
model_results = train_model(xt, yt, model_types, 3)
In [65]:
# keep the five candidates with the lowest mean absolute error (element 4)
top_models = sorted(model_results, key=lambda result: result[4])[:5]
In [66]:
# show the top tree-based models together with their metric values
top_models
Out[66]:
[['ExtraTreesRegressor', ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse', max_depth=17, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False), 1290.4962533537416, 619.8219226358189, 297.64252680277747], ['XGBRFRegressor', XGBRFRegressor(base_score=0.5, booster=None, colsample_bylevel=1, colsample_bynode=0.8, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints=None, learning_rate=1, max_delta_step=0, max_depth=15, min_child_weight=1, missing=nan, monotone_constraints=None, n_estimators=22, n_jobs=0, num_parallel_tree=22, objective='reg:squarederror', random_state=0, reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1, subsample=0.8, tree_method=None, validate_parameters=False, verbosity=None), 1265.3198242050778, 619.6649349211983, 326.6013590494792], ['GradientBoostingRegressor', GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='ls', max_depth=10, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=25, n_iter_no_change=None, presort='deprecated', random_state=None, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), 1273.5964250058314, 625.7661015153595, 330.5397418833929], ['RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=14, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False), 1337.4030867741383, 
656.2823945474651, 354.5745894387899], ['DecisionTreeRegressor', DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=19, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best'), 1362.246704219611, 690.8228969522511, 355.816448801743]]
In [67]:
# train the best (lowest-MAE) tree-based model on the full reduced dataset
best_model_entry = top_models[0]
pricing_model = best_model_entry[1].fit(xt, yt)
pricing_model_mae = best_model_entry[4]
In [68]:
# display the selected tree-based model and its hyperparameters
pricing_model
Out[68]:
ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=17, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=30, n_jobs=None, oob_score=False,
                    random_state=None, verbose=0, warm_start=False)
In [69]:
# save the selected model
with open('models/pricing_model.pickle', 'wb') as handle:
    pickle.dump(pricing_model, handle)

# save the model's MAE so prediction cells can report an uncertainty band
with open('models/pricing_model_mae.pickle', 'wb') as handle:
    pickle.dump(pricing_model_mae, handle)
In [70]:
# calculate and show the raw SHAP values for the model
# reference: https://christophm.github.io/interpretable-ml-book/shap.html

# load JS visualization code to notebook
shap.initjs()

# TreeExplainer computes SHAP values for tree-ensemble models
explainer = shap.TreeExplainer(pricing_model)
shap_values = explainer.shap_values(xt)

# beeswarm summary: per-feature distribution of SHAP contributions
shap.summary_plot(shap_values, xt)
In [71]:
# show the SHAP value-based relative model feature importances as a bar chart
shap.summary_plot(shap_values, xt, plot_type="bar")

Predict the price of any given property

In [72]:
# show the possible values for each feature, separated by blank lines;
# a loop replaces three copy-pasted open/print blocks
saved_feature_values = [
    ('Possible Postcodes', 'data/possible_postcodes.pickle'),
    ('Possible Floors', 'data/possible_floors.pickle'),
    ('Possible Property Types', 'data/possible_types.pickle'),
]

for position, (label, path) in enumerate(saved_feature_values):
    if position:
        print('')  # blank separator line between listings (matches original output)
    with open(path, 'rb') as handle:
        print(label, '=', pickle.load(handle))
Possible Postcodes = ['8053', '8044', '8002', '8057', '8046', '8008', '8050', '8038', '8006', '8055', '8048', '8045', '8037', '8032', '8004', '8049', '8001', '8047', '8005', '8052', '8051', '8064', '8003', '8041', '8061', '8000']

Possible Floors = ['1. floor', '5. floor', '4. floor', '12. floor', 'Ground floor', '3. floor', '2. floor', 'Basement', '10. floor', '11. floor', '9. floor', '7. floor', '6. floor', '17. floor', '19. floor', '16. floor', '8. floor', '13. floor', '18. floor', '14. floor']

Possible Property Types = ['Apartment', 'Other', 'Loft', 'Furnished apartment', 'Commercial property', 'Single room', 'Underground garage', 'Parking space', 'Single garage', 'Maisonette', 'Hobby room', 'Shared apartment', 'Attic apartment', 'Penthouse', 'Studio', 'Parking space, garage', 'Single-family house', 'Apartment with terrace']
In [73]:
# reload the processed records so the prediction section can run standalone
property_records = pd.read_csv('data/processed_property_records_rent.csv')
In [74]:
# load the pre-trained models and other required data from pickle files;
# a small helper replaces seven copy-pasted open/load blocks.
# NOTE(review): pickle.load executes arbitrary code — only load files this
# pipeline wrote itself. 'data/encoder.pickle' is not written by any cell
# above; presumably the library's process_records saves it — TODO confirm.


def load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


pricing_model = load_pickle('models/pricing_model.pickle')
pricing_model_mae = load_pickle('models/pricing_model_mae.pickle')
linear_pricing_model = load_pickle('models/linear_pricing_model.pickle')
linear_pricing_model_mae = load_pickle('models/linear_pricing_model_mae.pickle')
eliminated_columns = load_pickle('data/eliminated_columns.pickle')
scaler = load_pickle('data/scaler.pickle')
encoder = load_pickle('data/encoder.pickle')
In [75]:
# define the feature values for the property to be priced (must use values
# from the "possible" lists printed above for the categorical features)
living_space = 140        # presumably square metres, matching 'Living space' — TODO confirm
rooms = 5.0
postcode = '8003'
floor = '1. floor'
property_type = 'Apartment'
public_transport = 100    # presumably distance in metres, as scraped — TODO confirm
motorway = 100
shop = 100
In [76]:
# build the model input row from the raw feature values (library helper;
# applies the saved scaler and one-hot encoder)
input_values = encode_input(living_space, rooms, postcode, floor, property_type, public_transport, motorway, shop, scaler, encoder)
In [77]:
# display the encoded single-row model input
input_values
Out[77]:
living_space rooms public_transport motorway shop property_postcode floor property_type 8000 8001 ... Single garage Single room Single-family house Studio Underground garage scaled_living_space scaled_rooms scaled_public_transport scaled_motorway scaled_shop
0 140 5.0 100 100 100 8003 1. floor Apartment 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 2.476513 2.040651 -0.810851 -4.830155 -1.4742

1 rows × 77 columns

In [78]:
# use one of: ['regression_model', 'tree_model'] — any other value falls
# through to the tree model in the cell below
model_type = 'tree_model'
In [79]:
# calculate price: both models need the raw (unscaled / unencoded) columns
# removed; the linear model additionally needs the eliminated_columns dropped
raw_input_columns = ['living_space', 'rooms', 'public_transport', 'motorway', 'shop', 'property_postcode', 'floor', 'property_type']

if model_type == 'regression_model':
    price = linear_pricing_model.predict(input_values.drop(columns=raw_input_columns + eliminated_columns))[0]
    mae = linear_pricing_model_mae
else:
    price = pricing_model.predict(input_values.drop(columns=raw_input_columns))[0]
    mae = pricing_model_mae

# prepend the predicted price to the input row for the peer-group plot below
calculated_price = pd.concat([pd.DataFrame([price], columns=['gross_rent']), input_values], axis=1)
In [80]:
# display the predicted price combined with the encoded input features
calculated_price
Out[80]:
gross_rent living_space rooms public_transport motorway shop property_postcode floor property_type 8000 ... Single garage Single room Single-family house Studio Underground garage scaled_living_space scaled_rooms scaled_public_transport scaled_motorway scaled_shop
0 3843.633333 140 5.0 100 100 100 8003 1. floor Apartment 0.0 ... 0.0 0.0 0.0 0.0 0.0 2.476513 2.040651 -0.810851 -4.830155 -1.4742

1 rows × 78 columns

In [81]:
# report the prediction with its +/- MAE uncertainty band
print(f'Predicted Price = {price} +/- {mae} CHF')
print(f'Price Range = {price - mae} to {price + mae} CHF')
Predicted Price = 3843.633333333333 +/- 297.64252680277747 CHF
Price Range = 3545.990806530556 to 4141.275860136111 CHF
In [82]:
# the predicted price of the property is shown as a red cross, plotted
# alongside its peer group (properties with the same number of rooms and
# the same property type)
peer_group = property_records[(property_records['rooms'] == rooms) & (property_records['property_type'] == property_type)]
fig = px.scatter(peer_group, x="living_space", y="gross_rent", color="property_type", hover_data=['living_space'])
fig1 = px.scatter(calculated_price, x="living_space", y="gross_rent", title="Calculated Price vs Peer Group", hover_data=['property_postcode'])
fig1.update_traces(marker=dict(size=10, color='Red', symbol='x'))
fig.add_trace(fig1.data[0])
fig.show()
In [ ]: